# R Options
options(stringsAsFactors=FALSE)
# Required libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(patchwork)
library(openxlsx)
library(ggpubr)
library(ggsci)
library(knitr)
library(kableExtra)
library(Seurat)
library(purrr)
library(DT)
# Source plotting functions
source("R/functions_io.R")
source("R/functions_plotting.R")
source("R/functions_analysis.R")
source("R/functions_util.R")
# default options
knitr::opts_chunk$set(echo=TRUE, cache=FALSE, message=FALSE, warning=FALSE, fig.width=10)
options(stringsAsFactors = F)Open this code chunk to read all parameters that are set specifically for your project.
param = list()
# Project ID
param$project = "HTO_testDataset"
# Input data path in case Cell Ranger was run
param$path.data = "testDatasets/10x_pbmc_hto_GSE108313/cellranger"
# Output path
param$path.out = "testDatasets/10x_pbmc_hto_GSE108313/results"
if(!file.exists(param$path.out)) dir.create(param$path.out)
# HTO names
# HTOs have an ID that is included in the 'features.tsv' input file
# We additionally ask for readable names that are used throughout this report
# This could look like this, where HTO1-3 are the IDs included raw dataset
# param$hto.names = setNames(c("NameA", "NameB", "NameC"), c("HTO1", "HTO2", "HTO3"))
param$hto.names = setNames(c("htoA","htoB","htoC","htoD","htoE","htoF","htoG","htoH"),
c("htoA","htoB","htoC","htoD","htoE","htoF","htoG","htoH")) # Name equals ID in this test case
# Prefix of mitochondrial genes
param$mt = "^MT-"
# Main color to use for plots
param$col = "palevioletred"
# Sample data to at most n cells (mainly for tests); set to NULL to deactivate
param$sample_cells = 2000In this first section of the report, we read 10X data from the files produced by CellRanger:
and setup a Seurat object. This object includes different data types in separate assays:
Note that ‘Antibody Capture’ features can correspond to ‘HTO’ and ‘ADT’ and can be distinguised based on provided HTO names.
# Load the dataset with its assays and create a Seurat object; pass hto_names so that HTO will be a separate assay
sc = read_10x_dataset(param$path.data,hto_names = param$hto.names)
# If requested: sample at most n cells
if(!is.null(param$sample_cells)){
sampled.barcodes = sample(Cells(sc),min(param$sample_cells,length(Cells(sc))))
sc = subset(sc,cells = sampled.barcodes)
}
# Check for cells in other assays which have no HTO counts
original.assay.names = assay.names = setdiff(Assays(sc),"HTO")
assay_cells.hto = map(assay.names,function(a){
list(with = intersect(Cells(sc[[a]]),Cells(sc[["HTO"]])),
without = setdiff(Cells(sc[[a]]),Cells(sc[["HTO"]])))
})
names(assay_cells.hto) = assay.names
joint.barcodes = unique(flatten_chr(map(assay_cells.hto,function(a){a[["with"]]})))
# Subset Seurat object to all cells with HTO
sc = subset(sc, cells=joint.barcodes)
sc## An object of class Seurat
## 21863 features across 2000 samples within 2 assays
## Active assay: RNA (21855 features)
## 1 other assay present: HTO
summary = map_dfr(assay.names, function(a){
data.frame(Assay = a,
CellsTotal = length(assay_cells.hto[[a]][["with"]]) + length(assay_cells.hto[[a]][["without"]]),
CellsWithHtoReads = length(assay_cells.hto[[a]][["with"]]),
CellsWithoutHtoReads = length(assay_cells.hto[[a]][["without"]]))
})
kable(summary, align="l", caption="Dataset summary") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")| Assay | CellsTotal | CellsWithHtoReads | CellsWithoutHtoReads |
|---|---|---|---|
| RNA | 2000 | 2000 | 0 |
This section of the report shows how cells are assigned to their sample-of-origin.
We start the analysis by normalising raw HTO counts. HTO counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
We assign cells to their sample-of-origin, annotate negative cells that cannot be assigned to any sample, and doublet cells that are assigned to two samples.
# Demultiplex HTOs
sc = HTODemux(sc, assay = "HTO", positive.quantile = 0.99)
# Sort Idents levels for nicer plotting
Idents(sc) = factor(Idents(sc), levels=hto.levels)
sc@meta.data$hash.ID = factor(sc@meta.data$hash.ID, levels=hto.levels)
# HTO classification results
hash.ID.table = sc@meta.data %>% dplyr::count(hash.ID) %>% dplyr::rename(HTO=hash.ID) %>% mutate(Perc=round(n/sum(n)*100,2)) %>% as.data.frame
p1 = ggplot(sc@meta.data %>% dplyr::count(HTO_classification.global), aes(x="", y=n, fill=HTO_classification.global)) +
geom_bar(width=1, stat="identity") + coord_polar("y", start=0) +
xlab("") + ylab("")
p1 = plot.mystyle(p=p1, title="HTO global classification results", col=param$col.hto.global)
p2 = ggplot(sc@meta.data %>% dplyr::count(hash.ID), aes(x="", y=n, fill=hash.ID)) +
geom_bar(width=1, stat="identity") + coord_polar("y", start=0) +
xlab("") + ylab("")
p2 = plot.mystyle(p=p2, title="HTO classification results", col=param$col.hto.collapsed)
p1 + p2 + plot_annotation(title="HTO classification results") + gridExtra::tableGrob(hash.ID.table, rows=NULL)This section of the report visualises raw and normalised HTO data to understand whether the demultiplexing step has worked well.
# Distribution of HTO counts before and after normalisation
hto.t.raw = sc@assays$HTO@counts %>% as.data.frame %>% t %>% as.data.frame
hto.t.raw.pseudo = hto.t.raw + 1
hto.t.norm = sc@assays$HTO@data %>% as.data.frame %>% t %>% as.data.frame
p1 = ggplot(hto.t.raw.pseudo %>% pivot_longer(everything()), aes(x=name, y=value, fill=name)) + geom_violin() + scale_y_continuous(trans="log2") + xlab("") + ylab("")
p1 = plot.mystyle(p1, title="HTO raw counts", col=param$col.hto.collapsed, legend.title="HTO")
p2 = ggplot(hto.t.norm %>% pivot_longer(everything()), aes(x=name, y=value, fill=name)) + geom_violin() + xlab("") + ylab("")
p2 = plot.mystyle(p2, title="HTO normalised counts", col=param$col.hto.collapsed, legend.title="HTO")
p = p1 + p2 & theme(legend.position="bottom")
p + plot_annotation("HTO counts before and after normalisation") + plot_layout(guides = "collect")Pairs of raw (left tab) and normalised (right tab) HTO counts are visualised to confirm mutal exclusivity in singlet cells. Data points correspond to measured HTO counts per HTO, colours correspond to the assigned samples-of-origin.
n = df.all.col.combinations(x=hto.t.raw.pseudo, cell.classification=sc$hash.ID)
# plot
p = ggplot(n, aes(x=value1, y=value2, color=cell.classification)) + geom_point() +
scale_x_continuous(trans="log2") + scale_y_continuous(trans="log2") +
scale_color_manual(values=param$col.hto.collapsed)
p = plot.mystyle(p)
p = p + facet_grid(name2~name1, drop=FALSE) +
theme(axis.title.x = element_blank(),strip.text.x = element_text(size=10, color="black"),
axis.title.y = element_blank(),strip.text.y = element_text(size=10, color="black"),
strip.background = element_rect(colour="white", fill="lightgrey"),
legend.position="bottom",
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.5)) +
plot_annotation("Raw HTO counts")
pn = df.all.col.combinations(x=hto.t.norm, cell.classification=sc$hash.ID)
p = ggplot(n, aes(x=value1, y=value2, color=cell.classification)) + geom_point() +
scale_x_continuous(trans="log2") + scale_y_continuous(trans="log2") +
scale_color_manual(values=param$col.hto.collapsed)
p = plot.mystyle(p)
p = p + facet_grid(name2~name1, drop=FALSE) +
theme(axis.title.x = element_blank(),strip.text.x = element_text(size=10, color="black"),
axis.title.y = element_blank(),strip.text.y = element_text(size=10, color="black"),
strip.background = element_rect(colour="white", fill="lightgrey"),
legend.position="bottom") +
plot_annotation("Normalised HTO data")
pThe following ridge plots visualise the enrichment of assigned sample-of-origin for the respective normalised HTO counts.
# Group cells based on HTO classification
p = RidgePlot(sc, assay="HTO", features=rownames(sc@assays$HTO), same.y.lims=TRUE, cols=param$col.hto.collapsed, combine=FALSE)
for(i in 1:length(p)) p[[i]] = plot.mystyle(p[[i]], legend.title="Classified cells")
p = wrap_plots(p, ncol = 2) + plot_annotation("Normalised HTO data")+ plot_layout(guides = "collect") & theme(legend.position="bottom")
pLastly, we compare the number of features between classified cells.
# Number of features in the different cells
nfeature.metrics = grep("_HTO",grep("nFeature_",colnames(sc[[]]),v = T),v = T,invert = T)
p = VlnPlot(sc, features = nfeature.metrics, idents=levels(Idents(sc)), ncol=3, pt.size=0) + xlab("") + geom_violin(color=NA)
p = plot.mystyle(p, title="Number of features for HTO-classified cells", col=param$col.hto.collapsed, legend.title="Classified cells")
pThis section of the report states the number of cells that remain after negative and doublet cells are removed.
## An object of class Seurat
## 21863 features across 1603 samples within 2 assays
## Active assay: RNA (21855 features)
## 1 other assay present: HTO
This section of the report provides first insights into your RNA dataset based on a preliminary pre-processing of the RNA data using the standard scRNA-seq workflow.
sc.all = prelim.analysis(sc = sc.all, mt = param$mt, pc.n = 10)
sc = prelim.analysis(sc = sc, mt = param$mt, pc.n = 10)We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the assigned sample-of-origin.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see https://pair-code.github.io/understanding-umap/.
Finally, demultiplexed RNA data are written back to file.
# Create a directory for the demultiplexed data
demux.base_dir = file.path(param$path.out,"demultiplexed_samples")
dir.create(demux.base_dir, showWarnings=FALSE)
demux.samples_paths = c()
# Save each sample in a separate directory
samples = levels(Idents(sc))
for(s in samples){
p = export_seurat_assay_data_to_dir(sc[,Idents(sc)==s], dir=file.path(demux.base_dir,s),
assays=original.assay.names, slot="counts",
include_cell_metadata_cols=c("HTO_maxID","HTO_classification","HTO_classification.global", "hash.ID"))
demux.samples_paths = c(demux.samples_paths, p)
}
print(demux.samples_paths)## [1] "testDatasets/10x_pbmc_hto_GSE108313/results/demultiplexed_samples/htoH"
## [2] "testDatasets/10x_pbmc_hto_GSE108313/results/demultiplexed_samples/htoG"
## [3] "testDatasets/10x_pbmc_hto_GSE108313/results/demultiplexed_samples/htoF"
## [4] "testDatasets/10x_pbmc_hto_GSE108313/results/demultiplexed_samples/htoE"
## [5] "testDatasets/10x_pbmc_hto_GSE108313/results/demultiplexed_samples/htoD"
## [6] "testDatasets/10x_pbmc_hto_GSE108313/results/demultiplexed_samples/htoC"
## [7] "testDatasets/10x_pbmc_hto_GSE108313/results/demultiplexed_samples/htoB"
## [8] "testDatasets/10x_pbmc_hto_GSE108313/results/demultiplexed_samples/htoA"